
************************************
*********** PROJECT INFO ***********
************************************

// Record Linkage for Character-Based Names
// Clean 1880 names
// Author: Hannah Postel
// Date: 10/26/2022


***************************************
********* BASIC NAME CLEANING *********
***************************************

// Source: 1880 census extract downloaded from NBER
// Sample: males born in China, Chinese race, 48 contiguous states only
// N=97,970

use "$data/1880_chinese.dta", clear

** SPECIFY MISSING NAMES **
// Placeholder strings and blanks are recoded to "." so that one marker
// identifies a missing name in the cleaning steps below.
// N=17,615
replace namefrst = "." if inlist(namefrst, "---", "*", "", "UNKNOWN")
// N=61,875
replace namelast = "." if inlist(namelast, "---", "*", "", "UNKNOWN")

*** DROP THOSE WITH BOTH NAMES MISSING ***
// Not done here, to stay comparable with other matching methods, which do NOT drop missing names
// Recommend including this step in standalone analyses
// drop if namelast=="." & namefrst=="." // N=1,333

*** MANUAL CLEANING ***

// Mostly splitting up names that might be two characters
// Identified by name length and validated with enumeration forms
// Will submit to IPUMS for correction in the underlying data

// "First" names

// (a) Run-together names: the corrected form is the recorded string with one
//     space inserted, so the recorded form is rebuilt by deleting that space.
foreach fixed in "AH GOW" "AH TAKE" "AH KINE" "AH CHUCK" "SEE LONG" "AH GOON" ///
    "AH SING" "CUN GONG" "AH BEN" "AH HANG" "AH QUONG" "AH SAM" "AH YOUNG" ///
    "AH YUP" "CHA YONG" "CHE HONG" "CHE YEW" "CHU SING" "ENG TOY" "FOO KEE" ///
    "GUM HOP" "HAN GEE" "HAN KEE" "HI SONG" "HONG LING" "HOP SEE" "LEE FOO" ///
    "LANG GING" "LIN TOY" "LONG LOO" "SING GEE" "SING LOY" "YANG TANG" ///
    "YON SING" {
    replace namefrst = "`fixed'" if namefrst == subinstr("`fixed'", " ", "", .)
}

// (b) Recorded forms that differ from the corrected name by more than a space.
//     None of the corrected values is itself a recorded form listed in this
//     section, so the order of these statements does not matter.
replace namefrst = "LING"      if namefrst == "---LING"
replace namefrst = "AH GOW"    if namefrst == "ANGOW"
replace namefrst = "AH LOY"    if namefrst == "ALLOY"
replace namefrst = "AH SONG"   if namefrst == "ASONG"
replace namefrst = "AH GOIN"   if namefrst == "ANGOIN"
replace namefrst = "AH GONG"   if inlist(namefrst, "ANGONG", "ANGONE")
replace namefrst = "AH GOON"   if namefrst == "ANGOON"
replace namefrst = "AH SING"   if namefrst == "ASING"
replace namefrst = "AH LONG"   if namefrst == "ALONG"
replace namefrst = "AH CHEW"   if namefrst == "ACHEW"
replace namefrst = "AH HONG"   if namefrst == "AHONG"
replace namefrst = "AH HUNG"   if namefrst == "AHUNG"
replace namefrst = "AH YOUNG"  if namefrst == "AYOUNG"
replace namefrst = "AH MING"   if namefrst == "AMING"
replace namefrst = "AH SHUE"   if namefrst == "ASHUE"
replace namefrst = "AH SICK"   if namefrst == "ASICK"
replace namefrst = "CHUNG QUN" if inlist(namefrst, "CHUNGQUNN", "CHENQUUN")
replace namefrst = "FOO KEE"   if namefrst == "FOOKE"
replace namefrst = "LEE HOW"   if namefrst == "LEHOW"
replace namefrst = "LEE YOUNG" if namefrst == "LEYOUNG"

// (c) Western names recorded with an apostrophe
replace namefrst = "SAMUEL"    if namefrst == "SAM'L"
replace namefrst = "JIM OHARA" if namefrst == "JIM O'HARA"

// "Last" names
// Run-together "last" names: each corrected form below is the recorded string
// with a single space inserted, so the recorded form is rebuilt by deleting
// that space from the corrected one.
foreach fixed in "CHING LUNG" "MON SOO" "TEN FOO" "LUI SING" "LU GONG" ///
    "CHOU GE" "FUK SUIE" "SUIN GUIE" "HE SING" "LE SING" "SE YONG" "LE WING" ///
    "SING TIE" "QE NING" "CHEN LEY" "CHU RING" "SOO FOO" "ZO LONG" ///
    "MAN CHEE" "LO HING" "GUI ONG" "LUE KAN" "LI PEN" "LAY HO" "LEE KIN" ///
    "SAM KE" "HEU KEE" "WA KEE" "AH MIN" "SING WAN" "YING KEE" "WAH LEE" ///
    "AH YOUNG" "SHU SUNG" "YOU SING" "YE NONG" "LO YEE" "LO YONG" "SING LEE" {
    replace namelast = "`fixed'" if namelast == subinstr("`fixed'", " ", "", .)
}

// The one entry whose corrected spelling is not just a split of the recorded string
replace namelast = "HOP KEY" if namelast == "HOPPKEE"

// Manual replacements from enumeration pages

// "First" name corrections keyed on the recorded first name
replace namefrst = "AH HAO-ONG KONG"    if namefrst == "AH HO ONG KONG"
replace namefrst = "AH HONG SING HONG"  if namefrst == "AH HONG SING HON"
replace namefrst = "SEE TOO CHEE SING"  if namefrst == "SEE TOO CHEE SIN"
replace namefrst = "SEE TOO WING CHONG" if namefrst == "SEE TOO WING CHO"

// First name set to the missing marker for the record(s) with this "last" name.
// Kept ahead of the drop below because it can change which rows that drop matches.
replace namefrst = "." if namelast == "SING HI AH"

// Crossed out on enumeration page
drop if namefrst == "CHAN CHOW AH GOE"

// "Last" name corrections (the first one is pinned to a single person via first name and age)
replace namelast = "AH"          if namelast == "VAN DE CARR" & namefrst == "CHARLEY" & age == 19
replace namelast = "LEE TU ACHE" if namelast == "LE TU ACHE"

*** REPLACE SPECIALS WITH SPACES/BLANKS ***
// Both name fields get exactly the same treatment. Previously apostrophes
// were stripped from namefrst only, so a "last" name such as O'HARA kept its
// apostrophe while the same string in the "first" field lost it.
foreach v of varlist namefrst namelast {
    // Hyphens separate name fragments, so they become spaces
    replace `v' = subinstr(`v', "-", " ", .)

    // Every other special character is simply deleted
    replace `v' = subinstr(`v', "(", "", .)
    replace `v' = subinstr(`v', ")", "", .)
    replace `v' = subinstr(`v', "?", "", .)
    replace `v' = subinstr(`v', "*", "", .)
    replace `v' = subinstr(`v', ",", "", .)
    replace `v' = subinstr(`v', "'", "", .)

    // A lone "." is the missing-name marker and must survive
    replace `v' = subinstr(`v', ".", "", .) if `v' != "."

    // A name made up only of the characters removed above (e.g. "?" or "(?)")
    // is now blank; flag it with the same "." marker used for missing names
    // so it is not carried forward as an unmarked empty string.
    replace `v' = "." if strtrim(`v') == ""
}



*** REMOVE 'NUMBERED' NAMES ***
// Tags are deleted in list order; "NO TWO" must be tried before its shorter
// form "NO TW" or the longer tag would leave a stray "O" behind.
// NOTE(review): these are plain substring deletions, so a tag is also removed
// when it occurs inside a longer string (e.g. "NO ON" inside "NO ONE" or
// "NO ONG") -- confirm against the data that no such names exist.
foreach tag in "NO TWO" "NO II" "NO ON" "NO TW" "NO TH" {
    replace namefrst = subinstr(namefrst, "`tag'", "", .)
}

*** TRIM ADDITIONAL SPACES ***
foreach v of varlist namefrst namelast {
    // Inner call: collapse runs of internal blanks to a single blank
    // Outer call: strip leading and trailing blanks
    replace `v' = strtrim(stritrim(`v'))
}

*** FORMATTING "AH" HONORIFIC ***
// Only a stand-alone "A" fragment is an "AH" honorific. The patterns are
// anchored with \b (word boundary) so that fragments which merely END in "A"
// are left alone. Plain substring replacement of "A H" and "A " rewrote
// ordinary names as well, e.g. "CHA YONG" -> "CHAH YONG", "WA HING" -> "WAHING",
// and "A HONG" -> "AHONG" (a form the manual cleaning treats as needing a split).
// NOTE(review): ustrregexra() is a Unicode function -- assumes the name
// strings are plain ASCII / valid UTF-8; confirm for this extract.

// "A" followed by a stand-alone "H" fragment: "A H SING" -> "AH SING"
replace namefrst = ustrregexra(namefrst, "\bA H\b", "AH")

// Stand-alone "A" followed by another fragment: "A SING" -> "AH SING"
replace namefrst = ustrregexra(namefrst, "\bA ", "AH ")

// Whole name is just the honorific
replace namefrst = "AH" if inlist(namefrst, "A", "AA")

// Stand-alone "AA" or "AAH" followed by another fragment: "AA SING" -> "AH SING"
replace namefrst = ustrregexra(namefrst, "\bAAH? ", "AH ")

*** DROP NON-REAL NAMES ***
// Most with "China" as a name fragment have either 1 or 0 additional fragments
// Rescue the ones that do carry a full name before the blanket drops below
replace namefrst = "AH HONG" if namefrst == "AH HONG CHINA"
replace namefrst = "AH LIE"  if namefrst == "AH LIE CHINA"
replace namefrst = "HIP SOM" if namefrst == "CHINAMAN HIPSOM"

// strpos() is 0 when the substring is absent, so a nonzero result means "contains"
drop if strpos(namelast, "CHINA")  // 211 obs
drop if strpos(namefrst, "CHINA")  // 295 obs

// Various other non-real names
foreach junk in "CANTON" "CHINEE" "@CHINESECHARACTE" "CHINESE" {
    drop if strpos(namelast, "`junk'")
}
foreach junk in "CHINESE" "CHINEE" "CHINEYMAN" "NUMBER" {
    drop if strpos(namefrst, "`junk'")
}

*** STANDARDIZE AMERICANIZED SPELLINGS ***
replace namefrst = "CHARLIE" if inlist(namefrst, "CHARLEY", "CHARLY", "CHARLI")
replace namefrst = "CHARLES" if namefrst == "CHAS"
// NOTE(review): the third variant here is "CHARLIS", whereas the "first" name
// list above uses "CHARLI" -- confirm the difference is intentional.
replace namelast = "CHARLIE" if inlist(namelast, "CHARLEY", "CHARLY", "CHARLIS")



// Output of the "light" clean, used for the baseline match
// N=97,428
save "$data/1880_chn_clean.dta", replace


*****************************************
************** SEGMENTATION *************
*****************************************

*** PREPARING FOR SEGMENTATION ***

// 0/1 indicators: does the name contain at least one space?
foreach part in frst last {
    gen name`part'_mult = (strpos(name`part', " ") > 0)
}

// From here on a missing name is an empty string rather than "."
foreach part in frst last {
    replace name`part' = "" if name`part' == "."
}

// Per name field: number of spaces, and number of fragments
// (spaces + 1 for a non-empty name, 0 for an empty one)
foreach part in frst last {
    gen nspace_`part' = strlen(name`part') - strlen(subinstr(name`part', " ", "", .))
    gen nchar_`part'  = cond(name`part' == "", 0, nspace_`part' + 1)
}

// Totals across both name fields
gen nspace_tot = nspace_frst + nspace_last
gen nchar_tot  = nchar_frst + nchar_last

*** SEGMENTING "FIRST" NAMES ***

// One variable per space-separated fragment: name1, name2, ...
// The rest of this file refers to name1-name4, so at least four are expected.
split namefrst, gen(name)
local nfrag = r(nvars)

// Fix two-character surnames
// Rows whose first two fragments are a split spelling of a compound surname
// are flagged BEFORE name1 is rewritten, so that only rows actually merged
// here are shifted (a row whose name1 was already "SOOHOO"/"OWYANG" as a
// single fragment keeps its name2).
tempvar is_soohoo is_owyang merged
gen byte `is_soohoo' = (name1=="SOO" & inlist(name2, "HOO", "HOP")) | ///
                       (name1=="SEE" & inlist(name2, "HOO", "HO", "TOO")) | ///
                       (name1=="SU"  & name2=="HOO")
gen byte `is_owyang' = (name1=="OW" & name2=="YON") | (name1=="AU" & name2=="YONG")
gen byte `merged'    = `is_soohoo' | `is_owyang'

replace name1 = "SOOHOO" if `is_soohoo'
replace name1 = "OWYANG" if `is_owyang'

// The second fragment is now part of name1: move every later fragment one
// slot to the left and blank the slot vacated at the end. Without the final
// blanking the last fragment appears twice (e.g. "SEE TOO CHEE SING" would
// end up as SOOHOO / CHEE / SING / SING), and a two-fragment name would keep
// its stale second fragment in name2.
local lastshift = `nfrag' - 1
forvalues i = 2/`lastshift' {
    local next = `i' + 1
    replace name`i' = name`next' if `merged'
}
replace name`nfrag' = "" if `merged'

// A merged row has one fragment (and one separator) fewer than its raw
// string. The count variables created during segmentation prep drive the
// reassignment rules further down, so keep them in step with name1-name4.
foreach v of varlist nspace_frst nchar_frst nspace_tot nchar_tot {
    replace `v' = `v' - 1 if `merged'
}
drop `is_soohoo' `is_owyang' `merged'

// Fill in blanks with missing dot
// Working to "fill" all blank spots
// (only when there is no "last" name that could still supply fragments)
replace name2="." if namelast=="" & name2==""
replace name3="." if namelast=="" & name3==""
replace name4="." if namelast=="" & name4==""

*** SEGMENTING "LAST" NAMES ***
// One variable per space-separated fragment of namelast: othername1, othername2, ...
split namelast, gen(othername)
// Place each original name directly before its fragments. The variable ranges
// used here require that name4 and othername4 both exist.
order namefrst name1-name4 namelast othername1-othername4

// Fix two-character surnames
// Two leading fragments that spell a compound surname are merged into othername1
replace othername1="SOOHOO" if (othername1=="SOO" & othername2=="FOO")
replace othername1= "OWYANG" if (othername1=="OU" & othername2=="YONG")
// Pull the third fragment into the slot freed by the merge.
// NOTE(review): only othername2 is updated -- othername3 keeps its value, so a
// third fragment is then present in both othername2 and othername3. The
// condition also matches rows whose othername1 was already "SOOHOO"/"OWYANG"
// before the two lines above, and nchar_last / nchar_tot are not reduced for
// merged rows. Confirm whether this is intended before changing it: the
// reassignment rules later in the file read these variables.
replace othername2=othername3 if (othername1=="SOOHOO" | othername1=="OWYANG")

*** REASSIGN NAME FRAGMENTS TO NEW VARS ***
// Goal of this section: combine the fragments of the "first" name (name1-name4)
// and of the "last" name (othername1-othername4) into name1-name4.
// The rules run in sequence and most only fill a slot that is still blank,
// so their order matters.

// Filling in blanks
// No "first" fragment at all: start from the first "last" fragment
replace name1=othername1 if name1=="" & othername1!=""

// 1-fragment name in both locations
// (no spaces in either field and two fragments in total); name2 is overwritten
// without a blank check
replace name2=othername1 if nspace_tot==0 & nchar_tot==2

// Blank "first" names
// Remaining "last" fragments keep their own positions
replace name2=othername2 if name2=="" & namefrst=="" & othername2!=""
replace name3=othername3 if name3=="" & namefrst=="" & othername3!=""

// Fill in missing values
// Slots beyond the total number of fragments get the missing marker
// (overwrites whatever the slot held)
replace name3="." if nchar_tot<3
replace name4="." if nchar_tot<4

// 3-fragment names
// 2-fragment "first" + 1-fragment "last"
replace name3=othername1 if name3=="" & nchar_frst==2 & nchar_tot==3
// 1-fragment "first" + 2-fragment "last" (fragment 1)
replace name2=othername1 if name2=="" & nchar_frst==1 & nchar_tot==3
// 1-fragment "first" + 2-fragment "last" (fragment 2)
replace name3=othername2 if name3=="" & nchar_frst==1 & nchar_tot==3

// 4-fragment names
// 2-fragment + 2-fragment (fragment 1)
replace name3=othername1 if name3=="" & nchar_frst==2 & nchar_tot==4
// 2-fragment + 2-fragment (fragment 2)
replace name4=othername2 if name4=="" & nchar_frst==2 & nchar_tot==4
// 3-fragment + 1-fragment
replace name4=othername1 if name4=="" & nchar_frst==3 & nchar_tot==4
// Fourth fragment of the "last" name, whenever there is one.
// Unlike the rules above this has no blank check or fragment-count condition,
// so it overwrites any value already in name4.
// NOTE(review): the previous comment read "4-fragment + 0-fragment", but the
// code reads othername4 (the "last" name) -- confirm which was meant.
replace name4=othername4 if othername4!=""

// Formatting
// Strip the "." markers from name1 (a pure "." becomes blank and is re-marked below)
replace name1 = subinstr(name1, ".", "",.)
// Same for name2, except on rows whose name1 is SOOHOO
replace name2 = subinstr(name2, ".", "",.) if name1!="SOOHOO"
// NOTE(review): when both name1 and the first "last" fragment are SOOHOO, name2
// is overwritten with "SOOHOO" regardless of its current value -- this looks
// like a record-specific correction; confirm the intent.
replace name2 = "SOOHOO" if name1=="SOOHOO" & othername1=="SOOHOO"
// Mark slots that are still blank as missing.
// NOTE(review): name3 is not included here, so a blank name3 stays blank
// unless an earlier rule set it -- confirm whether that is intended.
replace name4="." if name4==""
replace name2="." if name2==""
replace name1="." if name1==""

// Final fragments first, then the original names and the "last" name fragments
order name1-name4 namefrst namelast othername1-othername4
sort name1 name2 name3 name4

// Save final segmented dataset
save "$data/1880_chn_finalnames.dta", replace
